This initial code chunk serves as the main configuration panel for
the entire analysis. It starts by clearing the environment to ensure a
clean session. The most important step here is selecting which
professional group to analyze from the local_names vector.
The results of the entire script will be specific to this selection.
library(sf)
## Linking to GEOS 3.10.2, GDAL 3.4.1, PROJ 8.2.1; sf_use_s2() is TRUE
# Reset the workspace so the analysis starts from a clean session.
# (This clears objects only; loaded packages and options are unaffected.)
rm(list = ls())
# Professional groups available for analysis; the whole script is specific to
# the one selected below.
local_names <- c(
  "machine and vehicle technology professions",
  "medical health professions",
  "professions in business management and organization",
  "mechatronics, energy, and electrical professions",
  "sales professions",
  "all professions"
)
### Which profession should be analyzed? ###
profession_selected <- local_names[6]
This section merges several datasets containing driving and public transport times between cities. These data were collected at different stages of the project. This consolidation step creates a single, comprehensive data frame (distances_relevant) with the most up-to-date travel times.
Note for replication: If you were to re-collect the travel time data from scratch, this manual merging process would not be necessary. You could proceed directly with your single, clean dataset.
# Load the main prepared dataset, which includes the list of relevant city pairs.
# NOTE: every load() below restores an object named `distances_relevant`,
# overwriting the previous one, so each is copied to a distinct name right away.
load(file="./DataBackups/newest_prepared_data.Rdata")
distances_important <- distances_relevant
# Load various distance/time files collected at different times.
load("./DataBackups/distances_AB_2023.Rdata")
distances_new <- distances_relevant
# NOTE(review): commuting and driving are cross-assigned from the *_nov columns
# (commuting <- driving_nov, driving <- commuting_nov). This looks like it
# corrects swapped columns in the 2023 file — confirm against the raw data;
# otherwise this silently swaps the two travel-time measures.
distances_new$commuting=distances_new$driving_nov
distances_new$driving= distances_new$commuting_nov
# Remember the target column layout so the other sources can be aligned below.
column_names <- colnames(distances_new)
load("./DataBackups/distances_AB_with_times_november.Rdata")
distances_nov <- distances_relevant
load("./DataBackups/distances.RData")
distances_july <- distances_relevant
# Combine the different data sources into one data frame.
# Use the minimum time from the different sources, assuming the fastest route is the most relevant.
distances_relevant <- distances_july
# Attach the November measurements column-wise.
# NOTE(review): this assumes distances_july and distances_nov have identical
# row order (same city pairs in the same sequence) — confirm.
distances_relevant$commuting_nov <- distances_nov$commuting_nov
distances_relevant$driving_nov <- distances_nov$driving_nov
# Align the column layout with distances_new, then stack both sources.
distances_relevant <- distances_relevant[,column_names]
distances_relevant <- rbind(distances_relevant, distances_new)
# Preserve the July values under explicit names before taking the row-wise minimum.
distances_relevant$commuting_jul <- distances_relevant$commuting
distances_relevant$driving_jul <- distances_relevant$driving
distances_relevant$commuting <- pmin(distances_relevant$commuting_jul,distances_relevant$commuting_nov)
distances_relevant$driving <- pmin(distances_relevant$driving_jul,distances_relevant$driving_nov)
# Cap commuting time at a maximum of twice the driving time as a plausibility check.
distances_relevant$commuting[distances_relevant$commuting > 2*distances_relevant$driving]<- 2*distances_relevant$driving[distances_relevant$commuting > 2*distances_relevant$driving]
# Clean up and merge with the main city pair data.
# NOTE(review): the positional selections below (columns 1,4,8:13 and the
# reorder to 1,3,4,2,5:13) depend on the exact column layout of the merged
# frame — re-verify after any change to the upstream files. Column 10 of the
# merged result is the public-transport time and is renamed "public".
distances_relevant <- distances_relevant[,c(1,4,8:13)]
distances_relevant <- merge(distances_important, distances_relevant, by=c("city...1","city...4"))
distances_relevant<-distances_relevant[,c(1,3,4,2,5,6,7:13)]
names(distances_relevant)[10] <- "public"
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(sfheaders)
# Rank city pairs by distance within each origin city, then restrict the data
# to cities and labour-agency districts present in the final sample.
distances_relevant <- distances_relevant %>%
  group_by(city...1) %>%
  arrange(-dist) %>%
  mutate(rank = rank(dist))
distances_relevant <- distances_relevant %>%
  filter(city...1 %in% final$city, city...4 %in% final$city)
distances_relevant <- distances_relevant %>%
  filter(ABDistrict...3 %in% final$ABDistrict, ABDistrict...6 %in% final$ABDistrict)
# Drop the now-redundant source copies.
rm(list = c("distances_new", "distances_july", "distances_nov", "distances_important"))
For some calculations, it is necessary to have a "distance to self" entry for each city (i.e., a distance of zero). This chunk creates these self-referential rows and adds them to the main distance data frame.
# Create a data frame of self-to-self city pairs with zero distance/time.
# Duplicating the first three id columns builds a "destination" side identical
# to the origin; the eight trailing zeros fill the distance/time columns so the
# layout matches distances_relevant (names are copied over below).
distances_relevant_self <- unique(cbind(distances_relevant[,1:3],distances_relevant[,1:3],0,0,0,0,0,0,0,0))
## New names:
## ⢠`city...1` -> `city...4`
## ⢠`geometry...2` -> `geometry...5`
## ⢠`ABDistrict...3` -> `ABDistrict...6`
## ⢠`` -> `...7`
## ⢠`` -> `...8`
## ⢠`` -> `...9`
## ⢠`` -> `...10`
## ⢠`` -> `...11`
## ⢠`` -> `...12`
## ⢠`` -> `...13`
## ⢠`` -> `...14`
# Keep exactly one self-pair per origin city (keyed on the first three columns).
distances_relevant_self <- distances_relevant_self[!duplicated(distances_relevant_self[,1:3]),]
colnames(distances_relevant_self) <- colnames(distances_relevant)
# Prepend the self-pairs to the main distance data.
distances_relevant <- rbind(distances_relevant_self, distances_relevant)
This section prepares the final analysis dataset by merging regional indicators and then explores the relationships between these variables through correlation plots and maps.
The regional indicators are originally at the administrative district (Kreise) level. This chunk aggregates them up to the labor agency district (AB-District) level using population-weighted averages to match the main dataset.
#### Calculate Dataset for evaluation
library(sfheaders)
library(corrplot)
## corrplot 0.92 loaded
# List the profession groups present in the data (should match local_names).
Professions <- unique(data$Profession)
Professions
## [1] "machine and vehicle technology professions"
## [2] "medical health professions"
## [3] "professions in business management and organization"
## [4] "mechatronics, energy, and electrical professions"
## [5] "sales professions"
## [6] "all professions"
# District key must be integer to merge with `final`'s `key` column below.
Tabelle_Kreise$Kennziffer<-as.integer(Tabelle_Kreise$Kennziffer)
profession <- as.data.frame(data)[,1:16]
# Exploratory subset with observed open positions (not used further below).
profession_test <- profession[!is.na(profession$OpenPositions),]
# Aggregate the regional indicators from administrative to labor agency district level.
# Most variables are aggregated using a population-weighted mean.
# population / density: presumably this equals land area (if density is
# inhabitants per unit area — TODO confirm), so area_share acts as an area weight.
Tabelle_Kreise$area_share <- 1/(Tabelle_Kreise$density ) * Tabelle_Kreise$`Bevƶlkerung gesamt_2022`
# Lookup table: administrative district key -> labour agency district.
Tabelle_Kreise_AB <- final %>% group_by(key) %>% as.data.frame() %>% select(key,ABDistrict) %>% unique()
Tabelle_Kreise <- merge(Tabelle_Kreise, Tabelle_Kreise_AB, by.x="Kennziffer", by.y="key")
# Copy the misspelled source column (share_trafic) to a correctly spelled one.
Tabelle_Kreise$share_traffic <- Tabelle_Kreise$share_trafic
# Recalculate from administrative districts to Labour Agency districts.
# Socio-economic rates use population-weighted means; density, urban permeation
# and the traffic share use area-weighted means (area_share ~ land area).
# NOTE: dplyr::summarise() evaluates its expressions sequentially, and a newly
# created summary masks the original column for all later expressions. Every
# weighted mean that uses `area_share` as a (vector) weight must therefore be
# computed BEFORE `area_share` itself is collapsed to its group total. In the
# original code `share_traffic` came after that collapse, which silently turned
# its weighted mean into a plain group sum — fixed by reordering below.
Tabelle_AB <- Tabelle_Kreise %>% mutate(inhabitants = `Bevƶlkerung gesamt_2022`) %>% group_by(ABDistrict) %>%
  summarize(unemp25 = sum(`Arbeitslosenquote Jüngere`*inhabitants)/sum(inhabitants),
            bigCo = sum(bigCo*inhabitants)/sum(inhabitants),
            cohort = sum(cohort*inhabitants)/sum(inhabitants),
            cons = sum(cons*inhabitants)/sum(inhabitants),
            gdp = sum(gdp*inhabitants)/sum(inhabitants),
            income = sum(income*inhabitants)/sum(inhabitants),
            medCo = sum(medCo*inhabitants)/sum(inhabitants),
            density = sum(density*area_share)/sum(area_share),
            cars = sum(cars*inhabitants)/sum(inhabitants),
            shareVET = sum(shareVET*inhabitants)/(sum(inhabitants)),
            shareUNI = sum(shareUNI*inhabitants)/(sum(inhabitants)),
            medSE = sum(medSE*inhabitants)/(sum(inhabitants)),
            lowSE = sum(lowSE*inhabitants)/(sum(inhabitants)),
            highSE = sum(as.numeric(highSE)*inhabitants)/sum(inhabitants),
            urban_permeation = sum(urban_permeation*area_share)/sum(area_share),
            # area-weighted mean — must precede the collapse of area_share
            share_traffic = sum(share_traffic*area_share)/sum(area_share),
            # from here on, area_share and inhabitants are group totals
            area_share = sum(area_share),
            inhabitants = sum(inhabitants/1000),
            num_ram = n_distinct(ram),
            rams = paste0(unique(ram), collapse = ";")
  )
# Merge the aggregated regional data into the main profession data frame.
profession <- merge(profession,Tabelle_AB, by.x="region", by.y="ABDistrict" )
These chunks generate correlation matrices to check for multicollinearity among the regional control variables. This is an important step to inform model specification and avoid including highly correlated predictors in the same regression.
# Correlation matrix of all aggregated regional indicators (columns 17:35 of
# the merged frame), using complete observations only.
cormatrix <- cor(profession[, 17:35], use = "complete")
corrplot(cormatrix, method = "number")
# print(cormatrix)
# Show only variables that correlate (|r| > 0.5) with at least one other variable.
strong <- abs(cormatrix) > 0.5
print(cormatrix[rowSums(strong) > 1, colSums(strong) > 1])
## bigCo cohort cons gdp income
## bigCo 1.00000000 0.25604006 -0.3552714 0.2923580 0.450870484
## cohort 0.25604006 1.00000000 -0.4414364 0.3019255 0.255752774
## cons -0.35527139 -0.44143638 1.0000000 -0.3161498 -0.434793909
## gdp 0.29235804 0.30192555 -0.3161498 1.0000000 0.668725046
## income 0.45087048 0.25575277 -0.4347939 0.6687250 1.000000000
## medCo 0.66108007 -0.02406495 -0.1158470 -0.1103339 0.003913158
## density 0.55873454 0.66876728 -0.5762521 0.4962969 0.579745931
## cars -0.48384894 -0.60449591 0.5542687 -0.1081675 -0.021590534
## shareUNI 0.55086946 0.30858461 -0.4611010 0.1762391 0.214069002
## medSE -0.27962198 -0.29542982 0.6192328 -0.1425332 -0.255300945
## highSE 0.24151553 0.41679888 -0.6190791 0.1296008 0.107690785
## urban_permeation 0.65887686 0.57259079 -0.5735019 0.3513459 0.444369337
## area_share -0.42881872 0.03933610 0.1796215 -0.1919430 -0.395181903
## inhabitants 0.04981916 0.89540821 -0.3392309 0.2710093 0.202123717
## medCo density cars shareUNI medSE
## bigCo 0.661080074 0.5587345 -0.48384894 0.5508695 -0.27962198
## cohort -0.024064952 0.6687673 -0.60449591 0.3085846 -0.29542982
## cons -0.115846986 -0.5762521 0.55426871 -0.4611010 0.61923277
## gdp -0.110333902 0.4962969 -0.10816754 0.1762391 -0.14253323
## income 0.003913158 0.5797459 -0.02159053 0.2140690 -0.25530095
## medCo 1.000000000 0.0836203 -0.27928130 0.2921439 -0.05656329
## density 0.083620301 1.0000000 -0.67522720 0.5011801 -0.37344206
## cars -0.279281301 -0.6752272 1.00000000 -0.5698160 0.43674241
## shareUNI 0.292143869 0.5011801 -0.56981601 1.0000000 -0.38207814
## medSE -0.056563295 -0.3734421 0.43674241 -0.3820781 1.00000000
## highSE 0.090124080 0.4445815 -0.54122826 0.4474222 -0.80922607
## urban_permeation 0.269761352 0.8883862 -0.70759174 0.4860467 -0.46585682
## area_share -0.174019259 -0.5350911 0.22296992 -0.2819633 0.18760381
## inhabitants -0.176366864 0.4804702 -0.42494808 0.1841255 -0.21502753
## highSE urban_permeation area_share inhabitants
## bigCo 0.24151553 0.6588769 -0.4288187 0.04981916
## cohort 0.41679888 0.5725908 0.0393361 0.89540821
## cons -0.61907910 -0.5735019 0.1796215 -0.33923091
## gdp 0.12960076 0.3513459 -0.1919430 0.27100932
## income 0.10769078 0.4443693 -0.3951819 0.20212372
## medCo 0.09012408 0.2697614 -0.1740193 -0.17636686
## density 0.44458149 0.8883862 -0.5350911 0.48047015
## cars -0.54122826 -0.7075917 0.2229699 -0.42494808
## shareUNI 0.44742221 0.4860467 -0.2819633 0.18412555
## medSE -0.80922607 -0.4658568 0.1876038 -0.21502753
## highSE 1.00000000 0.4748061 -0.1274681 0.33245568
## urban_permeation 0.47480606 1.0000000 -0.5097528 0.35302105
## area_share -0.12746815 -0.5097528 1.0000000 0.33727457
## inhabitants 0.33245568 0.3530210 0.3372746 1.00000000
####################################################################################
#Create a combined variable for medium and big companies.
# NOTE: cormatrix below was computed before mbCo existed, so mbCo itself does
# not appear in the printout; this print only tightens the threshold to 0.6.
profession$mbCo <- profession$medCo + profession$bigCo
#profession <- profession %>% select(-c(medCo,bigCo))
print(cormatrix[rowSums(abs(cormatrix)>0.6)>1,colSums(abs(cormatrix)>0.6)>1])
## bigCo cohort cons gdp income
## bigCo 1.00000000 0.25604006 -0.3552714 0.2923580 0.450870484
## cohort 0.25604006 1.00000000 -0.4414364 0.3019255 0.255752774
## cons -0.35527139 -0.44143638 1.0000000 -0.3161498 -0.434793909
## gdp 0.29235804 0.30192555 -0.3161498 1.0000000 0.668725046
## income 0.45087048 0.25575277 -0.4347939 0.6687250 1.000000000
## medCo 0.66108007 -0.02406495 -0.1158470 -0.1103339 0.003913158
## density 0.55873454 0.66876728 -0.5762521 0.4962969 0.579745931
## cars -0.48384894 -0.60449591 0.5542687 -0.1081675 -0.021590534
## medSE -0.27962198 -0.29542982 0.6192328 -0.1425332 -0.255300945
## highSE 0.24151553 0.41679888 -0.6190791 0.1296008 0.107690785
## urban_permeation 0.65887686 0.57259079 -0.5735019 0.3513459 0.444369337
## inhabitants 0.04981916 0.89540821 -0.3392309 0.2710093 0.202123717
## medCo density cars medSE highSE
## bigCo 0.661080074 0.5587345 -0.48384894 -0.27962198 0.24151553
## cohort -0.024064952 0.6687673 -0.60449591 -0.29542982 0.41679888
## cons -0.115846986 -0.5762521 0.55426871 0.61923277 -0.61907910
## gdp -0.110333902 0.4962969 -0.10816754 -0.14253323 0.12960076
## income 0.003913158 0.5797459 -0.02159053 -0.25530095 0.10769078
## medCo 1.000000000 0.0836203 -0.27928130 -0.05656329 0.09012408
## density 0.083620301 1.0000000 -0.67522720 -0.37344206 0.44458149
## cars -0.279281301 -0.6752272 1.00000000 0.43674241 -0.54122826
## medSE -0.056563295 -0.3734421 0.43674241 1.00000000 -0.80922607
## highSE 0.090124080 0.4445815 -0.54122826 -0.80922607 1.00000000
## urban_permeation 0.269761352 0.8883862 -0.70759174 -0.46585682 0.47480606
## inhabitants -0.176366864 0.4804702 -0.42494808 -0.21502753 0.33245568
## urban_permeation inhabitants
## bigCo 0.6588769 0.04981916
## cohort 0.5725908 0.89540821
## cons -0.5735019 -0.33923091
## gdp 0.3513459 0.27100932
## income 0.4443693 0.20212372
## medCo 0.2697614 -0.17636686
## density 0.8883862 0.48047015
## cars -0.7075917 -0.42494808
## medSE -0.4658568 -0.21502753
## highSE 0.4748061 0.33245568
## urban_permeation 1.0000000 0.35302105
## inhabitants 0.3530210 1.00000000
# Correlation matrix for a reduced candidate-control set, selected by column
# position (positions depend on the merge above — verify if the layout changes).
cormatrix <- cor(profession[, c(17, 20, 26, 28, 30)], use = "complete")
corrplot(cormatrix, method = "number")
# print(cormatrix)
# Restrict the printout to variables with at least one |r| > 0.5 partner.
strong <- abs(cormatrix) > 0.5
print(cormatrix[rowSums(strong) > 1, colSums(strong) > 1])
## cons medSE highSE
## cons 1.0000000 0.6192328 -0.6190791
## medSE 0.6192328 1.0000000 -0.8092261
## highSE -0.6190791 -0.8092261 1.0000000
####################################################################################
#profession <- profession %>% select(-c(shareUNI,shareVET))
# NOTE(review): mbCo is recomputed here identically to the earlier assignment
# (medCo and bigCo are unchanged in between) — redundant but harmless.
profession$mbCo <- profession$medCo + profession$bigCo
#profession <- profession %>% select(-c(medCo,bigCo))
# With the 5-variable cormatrix, the 0.6 threshold selects the same
# cons/medSE/highSE subset as 0.5 did above.
print(cormatrix[rowSums(abs(cormatrix)>0.6)>1,colSums(abs(cormatrix)>0.6)>1])
## cons medSE highSE
## cons 1.0000000 0.6192328 -0.6190791
## medSE 0.6192328 1.0000000 -0.8092261
## highSE -0.6190791 -0.8092261 1.0000000
This map provides a visual exploration of the travel time data. It plots connections between cities to highlight areas that are well-connected by car, by public transport, or by both, based on specific time thresholds. This helps in understanding the underlying spatial structure of the labor market.
library(tidyr)
library(purrr)
library(ggpattern)
library(sf)
library(data.table)
##
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
##
## transpose
## The following objects are masked from 'package:dplyr':
##
## between, first, last
# --- Convert coordinate pairs to sf LINESTRING objects for plotting ---
# Each connection becomes a two-point LINESTRING from origin to destination.
# geometry...2 / geometry...5 are assumed to be point geometries whose first
# and second elements are x and y (extracted via map(., 1) / map(., 2)) —
# TODO confirm against the prepared data.
# NOTE(review): rank_e is computed in all three pipelines but never used.
# For commuting connections: reachable by public transport (< 50 min) but
# not comfortably by car (> 30 min driving).
d_test_commute <- distances_relevant %>% group_by(city...1) %>% mutate(rank_e=rank(dist)) %>% filter(driving_nov > 30 & commuting_nov < 50)
# Origin coordinates (num = 1) and destination coordinates (num = 2).
d_mat1_c <- as.data.frame(cbind(unlist(map(d_test_commute$geometry...2,1)),unlist(map(d_test_commute$geometry...2,2))))
d_mat2_c <- as.data.frame(cbind(unlist(map(d_test_commute$geometry...5,1)),unlist(map(d_test_commute$geometry...5,2))))
d_mat1_c$num = rep(1, nrow(d_mat1_c))
d_mat2_c$num = rep(2, nrow(d_mat2_c))
# Shared id pairs each origin row with its destination row.
# NOTE(review): 1:nrow(x) misbehaves if a filter returns zero rows;
# seq_len(nrow(x)) would be safer.
d_mat1_c$id=1:nrow(d_mat1_c)
d_mat2_c$id=1:nrow(d_mat2_c)
d_mat_c <- rbind(d_mat1_c,d_mat2_c)
# Sort so each id's origin (num=1) precedes its destination (num=2).
setorder(d_mat_c,id,num)
connection_commute <- sfheaders::sf_linestring(
obj = d_mat_c, x = "V1", y = "V2" , linestring_id = "id" , keep = TRUE)
st_crs(connection_commute) <- "WGS84"
# For driving connections: fast by car (< 30 min) but slow by public
# transport (> 50 min).
d_test_drive <- distances_relevant %>% group_by(city...1) %>% mutate(rank_e=rank(dist)) %>% filter(driving_nov < 30 & commuting_nov > 50)
d_mat1_d <- as.data.frame(cbind(unlist(map(d_test_drive$geometry...2,1)),unlist(map(d_test_drive$geometry...2,2))))
d_mat2_d <- as.data.frame(cbind(unlist(map(d_test_drive$geometry...5,1)),unlist(map(d_test_drive$geometry...5,2))))
d_mat1_d$num = rep(1, nrow(d_mat1_d))
d_mat2_d$num = rep(2, nrow(d_mat2_d))
d_mat1_d$id=1:nrow(d_mat1_d)
d_mat2_d$id=1:nrow(d_mat2_d)
d_mat_d <- rbind(d_mat1_d,d_mat2_d)
setorder(d_mat_d,id,num)
connection_drive <- sfheaders::sf_linestring(
obj = d_mat_d, x = "V1", y = "V2" , linestring_id = "id" , keep = TRUE)
st_crs(connection_drive) <- "WGS84"
# For connections good by both modes: driving < 30 min AND public transport < 60 min.
d_test <- distances_relevant %>% group_by(city...1) %>% mutate(rank_e=rank(dist)) %>% filter(driving_nov < 30 & commuting_nov < 60)
d_mat1 <- as.data.frame(cbind(unlist(map(d_test$geometry...2,1)),unlist(map(d_test$geometry...2,2))))
d_mat2 <- as.data.frame(cbind(unlist(map(d_test$geometry...5,1)),unlist(map(d_test$geometry...5,2))))
d_mat1$num = rep(1, nrow(d_mat1))
d_mat2$num = rep(2, nrow(d_mat2))
d_mat1$id=1:nrow(d_mat1)
d_mat2$id=1:nrow(d_mat2)
d_mat <- rbind(d_mat1,d_mat2)
setorder(d_mat,id,num)
# Combine the three line datasets for plotting.
connection_sf <- sfheaders::sf_linestring(
obj = d_mat, x = "V1", y = "V2" , linestring_id = "id" , keep = TRUE)
st_crs(connection_sf) <- "WGS84"
library(ggpattern)
library(gridpattern)
library(ggplot2)
library("magick")
## Linking to ImageMagick 6.9.11.60
## Enabled features: fontconfig, freetype, fftw, heic, lcms, pango, webp, x11
## Disabled features: cairo, ghostscript, raw, rsvg
## Using 4 threads
# Drop plotting intermediates. ("examples" was removed from this list: it is
# never created in this script and only produced an 'object not found' warning.)
rm(list=c("d_mat1","d_mat2","d_test","d_mat"))
# Legend labels for the three connection types. The thresholds stated here now
# match the filters used above (public transport 50 min for a) and b),
# 60 min for c); driving 30 min throughout) — the original labels said 60 min
# for a) and b) although the filters use commuting_nov >/< 50.
connection_commute$mode = "b) Public Transport < 50 min, Driving > 30 min"
connection_drive$mode = "a) Public Transport > 50 min, Driving < 30 min"
connection_sf$mode = "c) Public Transport < 60 min, Driving < 30 min"
# German-language label variants (ÖPNV = public transport, KfZ = car):
#connection_commute$mode = "b) ÖPNV < 45 min, KfZ > 30 min"
#connection_drive$mode = "a) ÖPNV > 45 min, KfZ < 30 min"
#connection_sf$mode = "c) ÖPNV < 45 min, KfZ < 30 min"
# Stack the three layers; a)/b)/c) prefixes fix the legend order.
connection_graphics <- rbind(connection_drive,connection_commute,connection_sf)
# Discrete 10-colour palette used to fill the labour-agency districts on the map.
custom_disc_color_10 <- c(
"#0d6971", # Mix of Blue and Green
"#ffe4b5", # Light Orange
"#4b0082", # Darker Purple
"#1A2F80", # Navy Blue
"#0056a0", # Darker Blue
"#35005f", # Mix of Violet and Blue
"#1e7d32", # Lighter Green
"#957a48", # Mix of Green and Red
"#ec7063", # Red
"#fda06b" # Orange
)
# --- Create the final map ---
# Layers: district polygons (discrete fill by parent_DWH, de-emphasised via
# alpha), the three connection-line layers coloured by mode, and the
# representative city points on top.
map_cities <- ggplot() +
  geom_sf(data = districts, aes(fill = parent_DWH), alpha = 0.2) +
  geom_sf(data = connection_graphics, aes(color = mode), linewidth = 1) +
  geom_sf(data = final, color = "#003268", size = 2, shape = 20) +
  labs(color = "Travelling times by Transport Mode") +
  # `guides(fill = FALSE)` is deprecated since ggplot2 3.3.4 — use "none".
  guides(fill = "none") +
  scale_color_manual(values = c("#AA4455", "#E69F00", "#96c11f")) +
  theme_bw() +
  theme(
    legend.position = "bottom",
    legend.text = element_text(size = 10),
    legend.title = element_text(size = 11, face="bold"),
    legend.key.size = unit(0.5, "cm"),
    legend.box = "horizontal",
    legend.box.just = "center",
    legend.spacing.x = unit(0.5, "cm"),
    axis.title = element_text(size = 10, face="bold"), # axis titles (x/y)
    axis.text = element_text(size = 10)
  ) +
  guides(
    color = guide_legend(ncol = 1, title.position="top") # single-column legend
  ) +
  # Single fill scale. The original additionally added scale_fill_gradientn()
  # first, which ggplot2 immediately replaced with this discrete scale (and
  # warned about); the redundant continuous scale is removed here.
  scale_fill_manual(values = custom_disc_color_10)
map_cities
# Save at print quality (16 x 25 cm, 900 dpi).
ggsave(
filename = "./Outputs/Figures/map_cities.png", # or .tiff / .pdf
plot = map_cities,
width = 16, # width in cm
height = 25,
units = "cm",
dpi = 900
)
This is the start of the core analysis. The script now filters the main data frame to include only the data for the profession selected in the configuration chunk at the beginning of the file.
#Create small visualisation of Profession based differences in share of matched positions.
library(ggplot2)
# Share of matched positions = contracts / (contracts + still-open positions).
borders<-profession %>% mutate(share=NewContracts/(NewContracts+OpenPositions))
# NOTE(review): custom_disc_color is not defined in this part of the file (only
# custom_disc_color_10 is) — confirm it exists elsewhere in the session.
ggplot(borders) + geom_freqpoly(aes(x=share, color=Profession)) + theme_light() +
scale_colour_manual(values = custom_disc_color)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 36 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Filter the data to the selected profession (set in the configuration chunk).
profession <- profession[profession$Profession == profession_selected,]
#Define Regression Models and Variables
This section prepares the data and formulas for the regression analysis. It involves log-transforming variables as is standard for Cobb-Douglas matching functions and defining a set of model formulas to test different specifications.
# Add district ID and number of representative cities to the analysis data frame.
# districts[,2] presumably carries the district geometry along with the selected
# column (sf selections keep geometry), which is what st_area() relies on later
# — TODO confirm districts is an sf object.
profession <- merge(profession, districts[,2], by="ID")
# numCity = number of representative cities per labour agency district.
profession <- merge(profession,final %>% group_by(ABDistrict) %>% summarise(numCity=n()),by.x="region",by.y="ABDistrict")
library(spatialreg)
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(spdep)
##
## Attaching package: 'spdep'
## The following objects are masked from 'package:spatialreg':
##
## get.ClusterOption, get.coresOption, get.mcOption,
## get.VerboseOption, get.ZeroPolicyOption, set.ClusterOption,
## set.coresOption, set.mcOption, set.VerboseOption,
## set.ZeroPolicyOption
library(jtools)
library(sf)
#profession$target = pmin(profession$AllApplicants,profession$OpenPositions, na.rm = TRUE)
# Drop districts with zero matches: M = log(NewContracts) must be finite.
# NOTE(review): rows with NA in NewContracts survive this base-R filter as
# all-NA rows; they are dropped later by the models via na.action — confirm
# this is intended.
profession <- profession[profession$NewContracts!=0,]
# (A redundant self-assignment `profession <- profession` was removed here.)
# Log-transform the matching-function variables (Cobb-Douglas form).
# U and V presumably proxy the stocks of job seekers and vacancies as
# matches + remaining applicants / open positions — TODO confirm definition.
profession$M <- log(profession$NewContracts)
profession$U <- log((profession$NewContracts + profession$AllApplicants))
profession$V <- log((profession$NewContracts + profession$OpenPositions))
# Log-transform the regional control variables.
profession$shareUNI <- log(profession$shareUNI)
profession$shareVET <- log(profession$shareVET)
profession$unemp25 <- log(profession$unemp25)
profession$bigCo<- log(profession$bigCo)
profession$medCo<- log(profession$medCo)
profession$mbCo<- log(profession$mbCo)
profession$income <- log(profession$income)
#profession$cars <- log(profession$cars)
profession$highSE <- log(profession$highSE)
profession$cons <- log(profession$cons)
profession$gdp <- log(profession$gdp)
profession$lowSE <- log(profession$lowSE)
profession$medSE <- log(profession$medSE)
# District area from the merged geometry column.
profession$area <- as.numeric(st_area(st_sf((profession))))
# Define the target (dependent) variable used by all model formulas below.
target <- "M"
# --- Model specifications ----------------------------------------------------
# All models share the Cobb-Douglas core log(M) ~ log(U) + log(V); the variants
# add regional controls and/or an interchangeable `Agglomeration` term that is
# swapped in before each fit below.
# 1. Base: standard matching function with only U and V.
formula_base <- as.formula(paste(target, "~ U + V"))
# 2. Base plus a generic Agglomeration term.
formula_agg <- as.formula(paste(target, "~ U + V + Agglomeration"))
# 3. Base plus regional control variables.
formula_ctrl <- as.formula(paste(target, "~ U + V + unemp25 + cons + highSE + medSE"))
# 4. Full: controls plus the generic Agglomeration term.
formula_ctrl_agg <- as.formula(paste(target, "~ U + V + unemp25 + cons + highSE + medSE + Agglomeration"))
This section tests a series of simple, attribute-based measures of agglomeration. Each measure is used as the Agglomeration variable in the regression formulas defined above, and its performance is evaluated.
# Reference models without any agglomeration measure.
model_base <- lm(formula_base, profession, na.action = na.exclude)
model_controls <- lm(formula_ctrl, profession, na.action = na.exclude)
# Agglomeration measure: East/West Germany.
# Districts with parentID > 899 are classified as former East ("Ost").
profession$Regionname <- "West"
profession$Regionname[as.numeric(profession$parentID) > 899] <- "Ost"
profession$AgglomerationEast <- as.numeric(profession$Regionname == "Ost")
profession$Agglomeration <- profession$AgglomerationEast
model_east_west <- lm(formula_agg, profession, na.action = na.exclude)
model_ctrl_east_west <- lm(formula_ctrl_agg, profession, na.action = na.exclude)
# Single City
# 1 if the district consists of a single representative city; districts 214,
# 367 and 391 are explicitly excluded (reason not documented here — TODO
# confirm why these IDs are special).
profession$AgglomerationSingleCity <- (as.numeric((profession$numCity == 1) & (profession$ID != 214) & (profession$ID != 367) & (profession$ID != 391) ))
profession$Agglomeration <- profession$AgglomerationSingleCity
model_single_city_region <-lm(formula_agg, profession, na.action = na.exclude)
model_ctrl_single_city_region <-lm(formula_ctrl_agg, profession, na.action = na.exclude)
# Share of Biggest City
# NOTE: this mutates the global `final` (adds sample/numCity/shareCity columns).
final <- final %>% group_by(ABDistrict) %>% mutate(sample=sum(maxPopulation), numCity=n(), shareCity=max(maxPopulation)/max(all))
# One shareCity value per region, merged back into the analysis frame.
profession_agg_share<- unique(merge(profession,final, by.x ="region", by.y ="ABDistrict", all.x=TRUE)[,c("region","shareCity")])
profession<-merge(profession,profession_agg_share, by="region")
profession$AgglomerationShareCity <- log(profession$shareCity)
profession$Agglomeration <- profession$AgglomerationShareCity
model_share_cities <-lm(formula_agg, profession, na.action = na.exclude)
model_ctrl_share_cities <-lm(formula_ctrl_agg, profession, na.action = na.exclude)
# Cohort Size
profession$AgglomerationCohort <- log(profession$cohort)
profession$Agglomeration <- profession$AgglomerationCohort
model_cohort <-lm(formula_agg, profession, na.action= na.exclude)
model_ctrl_cohort <-lm(formula_ctrl_agg, profession, na.action= na.exclude)
# Population
# Fixed: the original used `profession$inhabitant`, which only worked through
# `$` partial matching; the column created by the aggregation step is
# `inhabitants`. Same value, but no longer fragile to new column names.
profession$AgglomerationPop <- log(profession$inhabitants)
profession$Agglomeration <- profession$AgglomerationPop
model_pop <-lm(formula_agg, profession, na.action= na.exclude)
model_ctrl_pop <-lm(formula_ctrl_agg,profession, na.action= na.exclude)
# --- Remaining simple agglomeration measures --------------------------------
# Each measure is logged, copied into the generic `Agglomeration` column, and
# fitted in both the base-plus-agglomeration and controls-plus-agglomeration
# specifications.
# Urban permeation
profession$AgglomerationUrbanPermeation <- log(profession$urban_permeation)
profession$Agglomeration <- profession$AgglomerationUrbanPermeation
model_urban_permeation <- lm(formula_agg, profession, na.action = na.exclude)
model_ctrl_urban_permeation <- lm(formula_ctrl_agg, profession, na.action = na.exclude)
# Number of distinct `ram` values per district (presumably regional agency
# units — confirm)
profession$AgglomerationNumRams <- log(profession$num_ram)
profession$Agglomeration <- profession$AgglomerationNumRams
model_num_ram <- lm(formula_agg, profession, na.action = na.exclude)
model_ctrl_num_ram <- lm(formula_ctrl_agg, profession, na.action = na.exclude)
# Population density
profession$AgglomerationDensity <- log(profession$density)
profession$Agglomeration <- profession$AgglomerationDensity
model_den <- lm(formula_agg, profession, na.action = na.exclude)
model_ctrl_den <- lm(formula_ctrl_agg, profession, na.action = na.exclude)
# Traffic infrastructure share
profession$AgglomerationTraffic <- log(profession$share_traffic)
profession$Agglomeration <- profession$AgglomerationTraffic
model_traffic <- lm(formula_agg, profession, na.action = na.exclude)
model_ctrl_traffic <- lm(formula_ctrl_agg, profession, na.action = na.exclude)
export_summs(model_base, model_east_west, model_single_city_region, model_share_cities, model_cohort, model_pop, model_urban_permeation, model_num_ram, model_den, model_traffic, model.names = c("Base", "EW", "SC", "NC", "CO", "POP", "UP", "RAM", "DEN", "TI"), stars = c(`***` = 0.001, `**` = 0.01, `*` = 0.05, `^` = 0.1) , number_format = "%.3f", statistics=c("logLik","AIC","BIC","N. obs." = "nobs"))
## Registered S3 methods overwritten by 'broom':
## method from
## tidy.glht jtools
## tidy.summary.glht jtools
| Base | EW | SC | NC | CO | POP | UP | RAM | DEN | TI | |
|---|---|---|---|---|---|---|---|---|---|---|
| (Intercept) | -0.210 *** | -0.222 *** | -0.220 *** | -0.196 **Ā | -0.175 **Ā | -0.210 *** | -0.210 *** | -0.211 *** | -0.200 **Ā | -0.240 *** |
| (0.054)Ā Ā Ā | (0.058)Ā Ā Ā | (0.055)Ā Ā Ā | (0.059)Ā Ā Ā | (0.057)Ā Ā Ā | (0.054)Ā Ā Ā | (0.057)Ā Ā Ā | (0.054)Ā Ā Ā | (0.061)Ā Ā Ā | (0.060)Ā Ā Ā | |
| U | 0.590 *** | 0.591 *** | 0.598 *** | 0.584 *** | 0.557 *** | 0.585 *** | 0.590 *** | 0.575 *** | 0.586 *** | 0.583 *** |
| (0.029)Ā Ā Ā | (0.029)Ā Ā Ā | (0.030)Ā Ā Ā | (0.031)Ā Ā Ā | (0.035)Ā Ā Ā | (0.032)Ā Ā Ā | (0.033)Ā Ā Ā | (0.030)Ā Ā Ā | (0.031)Ā Ā Ā | (0.030)Ā Ā Ā | |
| V | 0.419 *** | 0.420 *** | 0.413 *** | 0.425 *** | 0.445 *** | 0.421 *** | 0.420 *** | 0.436 *** | 0.422 *** | 0.432 *** |
| (0.030)Ā Ā Ā | (0.030)Ā Ā Ā | (0.031)Ā Ā Ā | (0.031)Ā Ā Ā | (0.033)Ā Ā Ā | (0.030)Ā Ā Ā | (0.032)Ā Ā Ā | (0.031)Ā Ā Ā | (0.031)Ā Ā Ā | (0.032)Ā Ā Ā | |
| Agglomeration | Ā Ā Ā Ā Ā Ā Ā Ā | 0.005Ā Ā Ā Ā | -0.008Ā Ā Ā Ā | 0.003Ā Ā Ā Ā | 0.011 ^Ā Ā | 0.005Ā Ā Ā Ā | 0.000Ā Ā Ā Ā | -0.014 ^Ā Ā | 0.002Ā Ā Ā Ā | -0.007Ā Ā Ā Ā |
| Ā Ā Ā Ā Ā Ā Ā Ā | (0.008)Ā Ā Ā | (0.009)Ā Ā Ā | (0.004)Ā Ā Ā | (0.006)Ā Ā Ā | (0.012)Ā Ā Ā | (0.005)Ā Ā Ā | (0.008)Ā Ā Ā | (0.007)Ā Ā Ā | (0.006)Ā Ā Ā | |
| logLik | 280.639Ā Ā Ā Ā | 280.834Ā Ā Ā Ā | 281.024Ā Ā Ā Ā | 280.805Ā Ā Ā Ā | 282.165Ā Ā Ā Ā | 280.732Ā Ā Ā Ā | 280.639Ā Ā Ā Ā | 282.315Ā Ā Ā Ā | 280.698Ā Ā Ā Ā | 281.348Ā Ā Ā Ā |
| AIC | -553.279Ā Ā Ā Ā | -551.668Ā Ā Ā Ā | -552.048Ā Ā Ā Ā | -551.610Ā Ā Ā Ā | -554.330Ā Ā Ā Ā | -551.464Ā Ā Ā Ā | -551.279Ā Ā Ā Ā | -554.631Ā Ā Ā Ā | -551.396Ā Ā Ā Ā | -552.695Ā Ā Ā Ā |
| BIC | -541.344Ā Ā Ā Ā | -536.750Ā Ā Ā Ā | -537.130Ā Ā Ā Ā | -536.692Ā Ā Ā Ā | -539.412Ā Ā Ā Ā | -536.546Ā Ā Ā Ā | -536.361Ā Ā Ā Ā | -539.713Ā Ā Ā Ā | -536.478Ā Ā Ā Ā | -537.777Ā Ā Ā Ā |
| N. obs. | 146Ā Ā Ā Ā Ā Ā Ā Ā | 146Ā Ā Ā Ā Ā Ā Ā Ā | 146Ā Ā Ā Ā Ā Ā Ā Ā | 146Ā Ā Ā Ā Ā Ā Ā Ā | 146Ā Ā Ā Ā Ā Ā Ā Ā | 146Ā Ā Ā Ā Ā Ā Ā Ā | 146Ā Ā Ā Ā Ā Ā Ā Ā | 146Ā Ā Ā Ā Ā Ā Ā Ā | 146Ā Ā Ā Ā Ā Ā Ā Ā | 146Ā Ā Ā Ā Ā Ā Ā Ā |
| *** p < 0.001; ** p < 0.01; * p < 0.05; ^ p < 0.1. | ||||||||||
# Same comparison as above, now adding the labour-market controls (unemp25,
# cons, highSE, medSE) to every specification.
export_summs(model_base, model_controls, model_ctrl_east_west, model_ctrl_single_city_region, model_ctrl_share_cities, model_ctrl_cohort, model_ctrl_pop, model_ctrl_urban_permeation, model_ctrl_num_ram, model_ctrl_den, model_ctrl_traffic, model.names = c("Base", "Controls", "EW", "SC", "NC", "CO", "POP", "UP", "RAM", "DEN", "TI"), stars = c(`***` = 0.001, `**` = 0.01, `*` = 0.05, `^` = 0.1) , number_format = "%.3f", statistics=c("logLik","AIC","BIC","N. obs." = "nobs"))
| Base | Controls | EW | SC | NC | CO | POP | UP | RAM | DEN | TI | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| (Intercept) | -0.210 *** | -0.662 **Ā | -0.613 *Ā Ā | -0.669 **Ā | -0.630 **Ā | -0.556 *Ā Ā | -0.723 **Ā | -0.671 **Ā | -0.652 **Ā | -0.641 **Ā | -0.652 **Ā |
| (0.054)Ā Ā Ā | (0.216)Ā Ā Ā | (0.236)Ā Ā Ā | (0.217)Ā Ā Ā | (0.211)Ā Ā Ā | (0.219)Ā Ā Ā | (0.237)Ā Ā Ā | (0.215)Ā Ā Ā | (0.213)Ā Ā Ā | (0.216)Ā Ā Ā | (0.216)Ā Ā Ā | |
| U | 0.590 *** | 0.655 *** | 0.659 *** | 0.657 *** | 0.672 *** | 0.633 *** | 0.662 *** | 0.648 *** | 0.649 *** | 0.653 *** | 0.651 *** |
| (0.029)Ā Ā Ā | (0.040)Ā Ā Ā | (0.040)Ā Ā Ā | (0.040)Ā Ā Ā | (0.039)Ā Ā Ā | (0.041)Ā Ā Ā | (0.041)Ā Ā Ā | (0.040)Ā Ā Ā | (0.039)Ā Ā Ā | (0.040)Ā Ā Ā | (0.040)Ā Ā Ā | |
| V | 0.419 *** | 0.362 *** | 0.358 *** | 0.360 *** | 0.340 *** | 0.375 *** | 0.362 *** | 0.364 *** | 0.370 *** | 0.360 *** | 0.370 *** |
| (0.030)Ā Ā Ā | (0.039)Ā Ā Ā | (0.040)Ā Ā Ā | (0.040)Ā Ā Ā | (0.039)Ā Ā Ā | (0.039)Ā Ā Ā | (0.040)Ā Ā Ā | (0.039)Ā Ā Ā | (0.039)Ā Ā Ā | (0.039)Ā Ā Ā | (0.040)Ā Ā Ā | |
| unemp25 | Ā Ā Ā Ā Ā Ā Ā Ā | -0.010Ā Ā Ā Ā | -0.012Ā Ā Ā Ā | -0.011Ā Ā Ā Ā | -0.019 *Ā Ā | -0.011Ā Ā Ā Ā | -0.009Ā Ā Ā Ā | -0.013Ā Ā Ā Ā | -0.011Ā Ā Ā Ā | -0.010Ā Ā Ā Ā | -0.010Ā Ā Ā Ā |
| Ā Ā Ā Ā Ā Ā Ā Ā | (0.008)Ā Ā Ā | (0.009)Ā Ā Ā | (0.009)Ā Ā Ā | (0.009)Ā Ā Ā | (0.008)Ā Ā Ā | (0.008)Ā Ā Ā | (0.009)Ā Ā Ā | (0.008)Ā Ā Ā | (0.008)Ā Ā Ā | (0.008)Ā Ā Ā | |
| cons | Ā Ā Ā Ā Ā Ā Ā Ā | 0.019Ā Ā Ā Ā | 0.015Ā Ā Ā Ā | 0.022Ā Ā Ā Ā | 0.046 ^Ā Ā | 0.031Ā Ā Ā Ā | 0.020Ā Ā Ā Ā | 0.028Ā Ā Ā Ā | 0.028Ā Ā Ā Ā | 0.029Ā Ā Ā Ā | 0.018Ā Ā Ā Ā |
| Ā Ā Ā Ā Ā Ā Ā Ā | (0.024)Ā Ā Ā | (0.025)Ā Ā Ā | (0.026)Ā Ā Ā | (0.026)Ā Ā Ā | (0.025)Ā Ā Ā | (0.025)Ā Ā Ā | (0.025)Ā Ā Ā | (0.024)Ā Ā Ā | (0.025)Ā Ā Ā | (0.024)Ā Ā Ā | |
| highSE | Ā Ā Ā Ā Ā Ā Ā Ā | 0.017Ā Ā Ā Ā | 0.012Ā Ā Ā Ā | 0.017Ā Ā Ā Ā | 0.009Ā Ā Ā Ā | 0.003Ā Ā Ā Ā | 0.022Ā Ā Ā Ā | 0.015Ā Ā Ā Ā | 0.013Ā Ā Ā Ā | 0.012Ā Ā Ā Ā | 0.014Ā Ā Ā Ā |
| Ā Ā Ā Ā Ā Ā Ā Ā | (0.025)Ā Ā Ā | (0.026)Ā Ā Ā | (0.025)Ā Ā Ā | (0.024)Ā Ā Ā | (0.025)Ā Ā Ā | (0.026)Ā Ā Ā | (0.024)Ā Ā Ā | (0.024)Ā Ā Ā | (0.025)Ā Ā Ā | (0.025)Ā Ā Ā | |
| medSE | Ā Ā Ā Ā Ā Ā Ā Ā | 0.076 *Ā Ā | 0.070 *Ā Ā | 0.075 *Ā Ā | 0.069 *Ā Ā | 0.057 ^Ā Ā | 0.083 *Ā Ā | 0.078 *Ā Ā | 0.067 *Ā Ā | 0.072 *Ā Ā | 0.071 *Ā Ā |
| Ā Ā Ā Ā Ā Ā Ā Ā | (0.031)Ā Ā Ā | (0.032)Ā Ā Ā | (0.031)Ā Ā Ā | (0.030)Ā Ā Ā | (0.031)Ā Ā Ā | (0.033)Ā Ā Ā | (0.030)Ā Ā Ā | (0.031)Ā Ā Ā | (0.031)Ā Ā Ā | (0.031)Ā Ā Ā | |
| Agglomeration | Ā Ā Ā Ā Ā Ā Ā Ā | Ā Ā Ā Ā Ā Ā Ā Ā | 0.005Ā Ā Ā Ā | 0.004Ā Ā Ā Ā | 0.014 **Ā | 0.013 *Ā Ā | -0.008Ā Ā Ā Ā | 0.010 ^Ā Ā | -0.016 *Ā Ā | 0.010Ā Ā Ā Ā | -0.006Ā Ā Ā Ā |
| Ā Ā Ā Ā Ā Ā Ā Ā | Ā Ā Ā Ā Ā Ā Ā Ā | (0.010)Ā Ā Ā | (0.010)Ā Ā Ā | (0.005)Ā Ā Ā | (0.006)Ā Ā Ā | (0.013)Ā Ā Ā | (0.006)Ā Ā Ā | (0.007)Ā Ā Ā | (0.007)Ā Ā Ā | (0.006)Ā Ā Ā | |
| logLik | 280.639Ā Ā Ā Ā | 288.797Ā Ā Ā Ā | 288.935Ā Ā Ā Ā | 288.894Ā Ā Ā Ā | 292.896Ā Ā Ā Ā | 291.054Ā Ā Ā Ā | 289.003Ā Ā Ā Ā | 290.238Ā Ā Ā Ā | 291.143Ā Ā Ā Ā | 289.773Ā Ā Ā Ā | 289.284Ā Ā Ā Ā |
| AIC | -553.279Ā Ā Ā Ā | -561.593Ā Ā Ā Ā | -559.870Ā Ā Ā Ā | -559.789Ā Ā Ā Ā | -567.791Ā Ā Ā Ā | -564.107Ā Ā Ā Ā | -560.006Ā Ā Ā Ā | -562.477Ā Ā Ā Ā | -564.287Ā Ā Ā Ā | -561.545Ā Ā Ā Ā | -560.569Ā Ā Ā Ā |
| BIC | -541.344Ā Ā Ā Ā | -537.724Ā Ā Ā Ā | -533.017Ā Ā Ā Ā | -532.937Ā Ā Ā Ā | -540.939Ā Ā Ā Ā | -537.255Ā Ā Ā Ā | -533.153Ā Ā Ā Ā | -535.624Ā Ā Ā Ā | -537.434Ā Ā Ā Ā | -534.693Ā Ā Ā Ā | -533.716Ā Ā Ā Ā |
| N. obs. | 146Ā Ā Ā Ā Ā Ā Ā Ā | 146Ā Ā Ā Ā Ā Ā Ā Ā | 146Ā Ā Ā Ā Ā Ā Ā Ā | 146Ā Ā Ā Ā Ā Ā Ā Ā | 146Ā Ā Ā Ā Ā Ā Ā Ā | 146Ā Ā Ā Ā Ā Ā Ā Ā | 146Ā Ā Ā Ā Ā Ā Ā Ā | 146Ā Ā Ā Ā Ā Ā Ā Ā | 146Ā Ā Ā Ā Ā Ā Ā Ā | 146Ā Ā Ā Ā Ā Ā Ā Ā | 146Ā Ā Ā Ā Ā Ā Ā Ā |
| *** p < 0.001; ** p < 0.01; * p < 0.05; ^ p < 0.1. | |||||||||||
This section moves beyond simple attributes to calculate agglomeration based on spatial proximity (distance, driving time, public transport time). A loop is used to find the "optimal" spatial radius for each measure by iterating through different thresholds and selecting the one that maximizes the model's log-likelihood.
# Attach the population/aggregate columns of `final` to both endpoints of
# every city pair, then convert the distance from metres to kilometres.
distances_relevant_pop <- distances_relevant %>%
  merge(final, by.x = "city...1", by.y = "city") %>%
  merge(final, by.x = "city...4", by.y = "city")
distances_relevant_pop$dist <- distances_relevant_pop$dist / 1000
This chunk tests agglomeration based on geographic distance, both with and without population weighting. I will not re-comment later chunks with the same structure.
#### Distance-based agglomeration, population weighted ####
# Threshold search: for each distance decile, measure how much population an
# average citizen can reach within that radius, use it as the agglomeration
# regressor, and keep the threshold with the highest log-likelihood.
best_itt_dist_pop <- 0
best_val_dist_pop <- 0
# Fallback fit with a constant (zero) agglomeration term; superseded inside
# the loop as soon as a real threshold fits better.
profession$Agglomeration <-0
model_dist_pop <- lm(formula_ctrl_agg, profession, na.action = na.exclude)
best_model_dist_pop <- NULL
best_model_ctrl_dist_pop <- NULL
profession <- profession %>% select(-Agglomeration)
profession$AgglomerationDistPop <- 0
# Within-district city pairs only; drop zero (self) distances.
distances_quantiles <- distances_relevant_pop %>% filter(ABDistrict...3 == ABDistrict...6) %>% select(dist) %>% filter(dist !=0)
# Loop through the 10%..90% distance deciles to find the best threshold.
for(test in quantile(distances_quantiles$dist, probs = seq(0,1,0.1))[2:10])
{
# For every district: keep connections below the test threshold and compute,
# for each city, how large a share of the district population can be reached;
# then average this (scaled back up by the district sample) over the
# district's cities.
distances_relevant_test <- distances_relevant_pop %>% filter(ABDistrict...3 == ABDistrict...6) %>% filter(dist <= test) %>% group_by(city...1) %>% summarise(target_share=sum(maxPopulation.y)/max(sample.y), self_pop = mean(maxPopulation.x), sample=max(sample.x), district=max(ABDistrict.x), .groups="keep") %>% group_by(district) %>% summarise(Agglomeration=mean(target_share*sample), .groups="keep")
# Log of the reachable population in thousands.
distances_relevant_test$Agglomeration <- log(distances_relevant_test$Agglomeration/1000 )
profession_new <- merge(profession,distances_relevant_test[,1:2], by.x ="region", by.y ="district", all.x=TRUE)
model_dist_pop<- lm(formula_ctrl_agg, profession_new, na.action=na.exclude)
print(logLik(model_dist_pop))
# Keep this threshold if it fits better than anything seen so far.
if(logLik(model_dist_pop) > best_val_dist_pop)
{ best_val_dist_pop <- logLik(model_dist_pop)
best_itt_dist_pop <- test
best_model_dist_pop <- lm(formula_agg, profession_new, na.action=na.exclude)
best_model_ctrl_dist_pop <- lm(formula_ctrl_agg, profession_new, na.action=na.exclude)
# NOTE(review): assumes merge() returns rows in the same order as
# `profession` (both sorted by region) — verify before reuse.
profession$AgglomerationDistPop <- profession_new$Agglomeration
}
}
## 'log Lik.' 292.9767 (df=9)
## 'log Lik.' 291.5382 (df=9)
## 'log Lik.' 290.6723 (df=9)
## 'log Lik.' 290.913 (df=9)
## 'log Lik.' 291.4203 (df=9)
## 'log Lik.' 291.1578 (df=9)
## 'log Lik.' 291.0669 (df=9)
## 'log Lik.' 292.0228 (df=9)
## 'log Lik.' 292.5981 (df=9)
best_itt_dist <- 0
best_val_dist <- 0
#### Distance-based agglomeration, without population weighting ####
# Same decile search as above, but the agglomeration score is the share of
# reachable within-district connections (n over numCity^2) rather than the
# reachable population.
profession$Agglomeration <-0
model_dist <- lm(formula_ctrl_agg, profession, na.action = na.exclude)
best_model_dist <- NULL
best_model_ctrl_dist <- NULL
profession <- profession %>% select(-Agglomeration)
profession$AgglomerationDist <- 0
distances_quantiles <- distances_relevant_pop %>% filter(ABDistrict...3 == ABDistrict...6) %>% select(dist) %>% filter(dist !=0)
for(test in quantile(distances_quantiles$dist, probs = seq(0,1,0.1))[2:10])
{
# Share of within-district connections shorter than the threshold.
distances_relevant_test <- distances_relevant_pop %>% filter(ABDistrict...3 == ABDistrict...6) %>% filter(dist <= test) %>% group_by(ABDistrict...3) %>% summarise(Agglomeration=n()/(max(numCity.x)*max(numCity.x)), .groups="keep")
distances_relevant_test$Agglomeration <- log(distances_relevant_test$Agglomeration )
profession_new <- merge(profession,distances_relevant_test[,1:2], by.x ="region", by.y ="ABDistrict...3", all.x=TRUE)
model_dist<- lm(formula_ctrl_agg, profession_new, na.action=na.exclude)
print(logLik(model_dist))
# Keep the threshold with the highest log-likelihood.
if(logLik(model_dist) > best_val_dist)
{ best_val_dist <- logLik(model_dist)
best_itt_dist <- test
best_model_dist <- lm(formula_agg, profession_new, na.action=na.exclude)
best_model_ctrl_dist <- lm(formula_ctrl_agg, profession_new, na.action=na.exclude)
profession$AgglomerationDist <- profession_new$Agglomeration
}
}
## 'log Lik.' 290.6696 (df=9)
## 'log Lik.' 289.4748 (df=9)
## 'log Lik.' 288.9885 (df=9)
## 'log Lik.' 289.2434 (df=9)
## 'log Lik.' 289.5303 (df=9)
## 'log Lik.' 289.1082 (df=9)
## 'log Lik.' 288.9877 (df=9)
## 'log Lik.' 289.418 (df=9)
## 'log Lik.' 289.399 (df=9)
This chunk repeats the optimization process using driving time instead of geographic distance.
#### Driving times with population weighting ####
# Same decile search as the distance loops, using car driving time as the
# reachability criterion.
best_itt_driving_pop <- 0
best_val_driving_pop <- 0
# Fallback fit with a constant (zero) agglomeration term; superseded inside
# the loop as soon as a real threshold fits better.
profession$Agglomeration <-0
model_driving_pop <- lm(formula_ctrl_agg, profession, na.action = na.exclude)
best_model_driving_pop <- NULL
best_model_ctrl_driving_pop <- NULL
profession <- profession %>% select(-Agglomeration)
profession$AgglomerationDrivingPop <- 0
driving_quantiles <- distances_relevant_pop %>% filter(ABDistrict...3 == ABDistrict...6) %>% select(driving) %>% filter(driving !=0)
for(test in quantile(driving_quantiles$driving, probs = seq(0,1,0.1))[2:10])
{
# BUG FIX: a leftover debugging line here used to overwrite `test` with the
# fixed 30% quantile on every iteration, so all nine iterations evaluated the
# same threshold (the nine printed log-likelihoods were identical). Removing
# it restores the intended decile search, mirroring the other loops.
distances_relevant_test <- distances_relevant_pop %>% filter(ABDistrict...3 == ABDistrict...6) %>% filter(driving <= test) %>% group_by(city...1) %>% summarise(target_share=sum(maxPopulation.y)/max(sample.y), self_pop = mean(maxPopulation.x), sample=max(sample.x), district=max(ABDistrict.x), .groups="keep") %>% group_by(district) %>% summarise(Agglomeration=mean(target_share*sample), .groups="keep")
distances_relevant_test$Agglomeration <- log(distances_relevant_test$Agglomeration /1000)
profession_new <- merge(profession,distances_relevant_test[,1:2], by.x ="region", by.y ="district", all.x=TRUE)
model_driving_pop<- lm(formula_ctrl_agg, profession_new, na.action=na.exclude)
print(logLik(model_driving_pop))
# Keep this threshold if it fits better than anything seen so far.
if(logLik(model_driving_pop) > best_val_driving_pop)
{ best_val_driving_pop <- logLik(model_driving_pop)
best_itt_driving_pop <- test
best_model_driving_pop <- lm(formula_agg, profession_new, na.action=na.exclude)
best_model_ctrl_driving_pop <- lm(formula_ctrl_agg, profession_new, na.action = na.exclude)
profession$AgglomerationDrivingPop <- profession_new$Agglomeration
}
}
## 'log Lik.' 291.8862 (df=9)
## 'log Lik.' 291.8862 (df=9)
## 'log Lik.' 291.8862 (df=9)
## 'log Lik.' 291.8862 (df=9)
## 'log Lik.' 291.8862 (df=9)
## 'log Lik.' 291.8862 (df=9)
## 'log Lik.' 291.8862 (df=9)
## 'log Lik.' 291.8862 (df=9)
## 'log Lik.' 291.8862 (df=9)
best_itt_driving <- 0
best_val_driving <- 0
#### Driving times, without population weighting ####
# Share of within-district connections reachable within the driving-time
# threshold, searched over the 10%..90% deciles.
profession$Agglomeration <-0
model_driving <- lm(formula_ctrl_agg, profession, na.action = na.exclude)
best_model_driving <- NULL
best_model_ctrl_driving <- NULL
profession <- profession %>% select(-Agglomeration)
profession$AgglomerationDriving <- 0
driving_quantiles <- distances_relevant_pop %>% filter(ABDistrict...3 == ABDistrict...6) %>% select(driving) %>% filter(driving !=0)
for(test in quantile(driving_quantiles$driving, probs = seq(0,1,0.1))[2:10])
{
distances_relevant_test <- distances_relevant_pop %>% filter(ABDistrict...3 == ABDistrict...6) %>% filter(driving <= test) %>% group_by(ABDistrict...3) %>% summarise(Agglomeration=n()/(max(numCity.x)*max(numCity.x)), .groups="keep")
distances_relevant_test$Agglomeration <- log(distances_relevant_test$Agglomeration )
profession_new <- merge(profession,distances_relevant_test[,1:2], by.x ="region", by.y ="ABDistrict...3", all.x=TRUE)
model_driving<- lm(formula_ctrl_agg, profession_new, na.action=na.exclude)
print(logLik(model_driving))
# Keep the threshold with the highest log-likelihood.
if(logLik(model_driving) > best_val_driving)
{ best_val_driving <- logLik(model_driving)
best_itt_driving <- test
best_model_driving <- lm(formula_agg, profession_new, na.action=na.exclude)
best_model_ctrl_driving <- lm(formula_ctrl_agg, profession_new, na.action=na.exclude)
profession$AgglomerationDriving <- profession_new$Agglomeration
}
}
## 'log Lik.' 290.0226 (df=9)
## 'log Lik.' 290.0495 (df=9)
## 'log Lik.' 289.816 (df=9)
## 'log Lik.' 289.2367 (df=9)
## 'log Lik.' 289.1443 (df=9)
## 'log Lik.' 289.003 (df=9)
## 'log Lik.' 288.8907 (df=9)
## 'log Lik.' 289.3197 (df=9)
## 'log Lik.' 289.5618 (df=9)
This chunk repeats the optimization process using public transport time.
# Column reserved for the measure at a fixed threshold (the 40% quantile,
# roughly 49 minutes of public transport); filled inside the loop below.
profession$AgglomerationPublicPop49 <- 0
#### Public transport times with population weighting ####
best_itt_public_pop <- 0
best_val_public_pop <- 0
profession$Agglomeration <-0
model_public_pop <- lm(formula_ctrl_agg, profession, na.action = na.exclude)
best_model_public_pop <- NULL
best_model_ctrl_public_pop <- NULL
profession <- profession %>% select(-Agglomeration)
profession$AgglomerationPublicPop <- 0
public_quantiles <- distances_relevant_pop %>% filter(ABDistrict...3 == ABDistrict...6) %>% select(public) %>% filter(public !=0)
for(test in quantile(public_quantiles$public, probs = seq(0,1,0.1))[2:10])
{
distances_relevant_test <- distances_relevant_pop %>% filter(ABDistrict...3 == ABDistrict...6) %>% filter(public <= test) %>% group_by(city...1) %>% summarise(target_share=sum(maxPopulation.y)/max(sample.y), self_pop = mean(maxPopulation.x), sample=max(sample.x), district=max(ABDistrict.x), .groups="keep") %>% group_by(district) %>% summarise(Agglomeration=mean(target_share*sample), .groups="keep")
distances_relevant_test$Agglomeration <- log(distances_relevant_test$Agglomeration /1000)
profession_new <- merge(profession,distances_relevant_test[,1:2], by.x ="region", by.y ="district", all.x=TRUE)
model_public_pop<- lm(formula_ctrl_agg, profession_new, na.action=na.exclude)
print(logLik(model_public_pop))
# Save the measure at the fixed 40% quantile threshold separately, so the
# best-fitting agglomeration can also be reused for other professions.
if(test == quantile(public_quantiles$public, probs = seq(0,1,0.1))[5])
{
profession$AgglomerationPublicPop49 <- profession_new$Agglomeration
}
# Keep this threshold if it fits better than anything seen so far.
if(logLik(model_public_pop) > best_val_public_pop)
{ best_val_public_pop <- logLik(model_public_pop)
best_itt_public_pop <- test
best_model_public_pop <- lm(formula_agg, profession_new, na.action=na.exclude)
best_model_ctrl_public_pop <- lm(formula_ctrl_agg, profession_new, na.action=na.exclude)
profession$AgglomerationPublicPop <- profession_new$Agglomeration
}
}
## 'log Lik.' 292.1303 (df=9)
## 'log Lik.' 293.4289 (df=9)
## 'log Lik.' 294.2921 (df=9)
## 'log Lik.' 294.8036 (df=9)
## 'log Lik.' 293.6843 (df=9)
## 'log Lik.' 292.7954 (df=9)
## 'log Lik.' 292.0441 (df=9)
## 'log Lik.' 292.3116 (df=9)
## 'log Lik.' 292.7055 (df=9)
best_itt_public <- 0
best_val_public <- 0
#### Public transport times, without population weighting ####
# Share of within-district connections reachable within the public transport
# threshold, searched over the 10%..90% deciles.
profession$Agglomeration <-0
model_public <- lm(formula_ctrl_agg, profession, na.action = na.exclude)
best_model_public <- NULL
best_model_ctrl_public <- NULL
profession <- profession %>% select(-Agglomeration)
profession$AgglomerationPublic <- 0
public_quantiles <- distances_relevant_pop %>% filter(ABDistrict...3 == ABDistrict...6) %>% select(public) %>% filter(public !=0)
for(test in quantile(public_quantiles$public, probs = seq(0,1,0.1))[2:10])
{
distances_relevant_test <- distances_relevant_pop %>% filter(ABDistrict...3 == ABDistrict...6) %>% filter(public <= test) %>% group_by(ABDistrict...3) %>% summarise(Agglomeration=n()/(max(numCity.x)*max(numCity.x)), .groups="keep")
distances_relevant_test$Agglomeration <- log(distances_relevant_test$Agglomeration )
profession_new <- merge(profession,distances_relevant_test[,1:2], by.x ="region", by.y ="ABDistrict...3", all.x=TRUE)
model_public<- lm(formula_ctrl_agg, profession_new, na.action=na.exclude)
print(logLik(model_public))
# Keep the threshold with the highest log-likelihood.
if(logLik(model_public) > best_val_public)
{ best_val_public <- logLik(model_public)
best_itt_public <- test
best_model_public <- lm(formula_agg, profession_new, na.action=na.exclude)
best_model_ctrl_public <- lm(formula_ctrl_agg, profession_new, na.action=na.exclude)
profession$AgglomerationPublic <- profession_new$Agglomeration
}
}
## 'log Lik.' 289.8518 (df=9)
## 'log Lik.' 290.7524 (df=9)
## 'log Lik.' 291.2594 (df=9)
## 'log Lik.' 291.1886 (df=9)
## 'log Lik.' 290.8449 (df=9)
## 'log Lik.' 290.2581 (df=9)
## 'log Lik.' 289.3861 (df=9)
## 'log Lik.' 289.4187 (df=9)
## 'log Lik.' 289.5331 (df=9)
This chunk calculates agglomeration based on the K-Nearest Neighbors, providing an alternative to distance/time thresholds. This section was not included in the final paper.
#### K-nearest neighbours with population weighting ####
# Same search as the threshold loops, but "reachable" means the k nearest
# cities (rank <= k) for k = 1..10.
best_itt_knn_pop <- 0
best_val_knn_pop <- 0
profession$Agglomeration <-0
model_knn_pop <- lm(formula_ctrl_agg, profession, na.action = na.exclude)
best_model_knn_pop <- NULL
# BUG FIX: this was initialised as `best_model_knn_ctrl_pop`, while the loop
# below and the later export_summs calls use `best_model_ctrl_knn_pop`, which
# was therefore never initialised.
best_model_ctrl_knn_pop <- NULL
profession <- profession %>% select(-Agglomeration)
profession$AgglomerationKnnPop <- 0
for(test in 1:10)
{
distances_relevant_test <- distances_relevant_pop %>% filter(ABDistrict...3 == ABDistrict...6) %>% filter(rank <= test) %>% group_by(city...1) %>% summarise(target_share=sum(maxPopulation.y)/max(sample.y), self_pop = mean(maxPopulation.x), sample=max(sample.x), district=max(ABDistrict.x), .groups="keep") %>% group_by(district) %>% summarise(Agglomeration=mean(target_share*sample), .groups="keep")
distances_relevant_test$Agglomeration <- log(distances_relevant_test$Agglomeration /1000)
profession_new <- merge(profession,distances_relevant_test[,1:2], by.x ="region", by.y ="district", all.x=TRUE)
model_knn_pop<- lm(formula_ctrl_agg, profession_new, na.action=na.exclude)
print(logLik(model_knn_pop))
# Keep this k if it fits better than anything seen so far.
if(logLik(model_knn_pop) > best_val_knn_pop)
{ best_val_knn_pop <- logLik(model_knn_pop)
best_itt_knn_pop <- test
best_model_knn_pop <- lm(formula_agg, profession_new, na.action=na.exclude)
best_model_ctrl_knn_pop <- lm(formula_ctrl_agg, profession_new, na.action=na.exclude)
profession$AgglomerationKnnPop <- profession_new$Agglomeration
}
}
## 'log Lik.' 294 (df=9)
## 'log Lik.' 295.1545 (df=9)
## 'log Lik.' 293.518 (df=9)
## 'log Lik.' 293.3743 (df=9)
## 'log Lik.' 293.8621 (df=9)
## 'log Lik.' 294.4633 (df=9)
## 'log Lik.' 293.6468 (df=9)
## 'log Lik.' 293.6696 (df=9)
## 'log Lik.' 293.8564 (df=9)
## 'log Lik.' 294.2084 (df=9)
best_itt_knn <- 0
best_val_knn <- 0
#### K-nearest neighbours, without population weighting ####
# Share of within-district connections among the k nearest, for k = 1..10.
profession$Agglomeration <-0
model_knn <- lm(formula_ctrl_agg, profession, na.action = na.exclude)
best_model_knn <- NULL
best_model_ctrl_knn <- NULL
profession <- profession %>% select(-Agglomeration)
profession$AgglomerationKnn <- 0
for(test in 1:10)
{
distances_relevant_test <- distances_relevant_pop %>% filter(ABDistrict...3 == ABDistrict...6) %>% filter(rank <= test) %>% group_by(ABDistrict...3) %>% summarise(Agglomeration=n()/(max(numCity.x)*max(numCity.x)), .groups="keep")
distances_relevant_test$Agglomeration <- log(distances_relevant_test$Agglomeration )
profession_new <- merge(profession,distances_relevant_test[,1:2], by.x ="region", by.y ="ABDistrict...3", all.x=TRUE)
model_knn<- lm(formula_ctrl_agg, profession_new, na.action=na.exclude)
print(logLik(model_knn))
# Keep the k with the highest log-likelihood.
if(logLik(model_knn) > best_val_knn)
{ best_val_knn <- logLik(model_knn)
best_itt_knn <- test
best_model_knn <- lm(formula_agg, profession_new, na.action=na.exclude)
best_model_ctrl_knn <- lm(formula_ctrl_agg, profession_new, na.action=na.exclude)
profession$AgglomerationKnn <- profession_new$Agglomeration
}
}
## 'log Lik.' 290.8421 (df=9)
## 'log Lik.' 291.5449 (df=9)
## 'log Lik.' 290.2167 (df=9)
## 'log Lik.' 290.3848 (df=9)
## 'log Lik.' 290.6251 (df=9)
## 'log Lik.' 291.095 (df=9)
## 'log Lik.' 290.2314 (df=9)
## 'log Lik.' 290.0614 (df=9)
## 'log Lik.' 290.3028 (df=9)
## 'log Lik.' 290.6402 (df=9)
This final part of the analysis section brings together all the results, creating comprehensive summary tables, visualizing key findings, and saving the final workspace.
library(huxtable)
##
## Attaching package: 'huxtable'
## The following object is masked from 'package:ggplot2':
##
## theme_grey
## The following object is masked from 'package:dplyr':
##
## add_rownames
library(formula.tools)
# Labels for the full comparison tables (attribute-based measures plus the
# best proximity-based measures found above).
# NOTE(review): `model_names` has 17 labels for the 18 models passed to the
# first export_summs call below — "RAM" (model_num_ram) appears to be
# missing between "URB" and "DEN".  It is NOT fixed here because
# `model_names[-1]` is reused later (colnames of agglo_data, cor_matrix)
# where exactly 16 entries are expected; verify both uses together.
model_names <- c("Base", "EA-WE", "CITY", "CITY-SHARE", "COHO", "POP", "URB", "DEN", "TI","DIST","POP-DIST","DRIVE","POP-DRIVE","PUBLIC","POP-PUB","KNN","KNNP")
model_names_ctrl <- c("Base", "Controls", "EA-WE", "CITY", "CITY-SHARE", "COHO", "POP", "URB", "RAM", "DEN", "TI","DIST","POP-DIST","DRIVE","POP-DRIVE","PUBLIC","POP-PUB","KNN","KNNP")
# Full table without controls.
table <- export_summs(model_base, model_east_west, model_single_city_region, model_share_cities, model_cohort, model_pop, model_urban_permeation, model_num_ram, model_den, model_traffic, best_model_dist, best_model_dist_pop, best_model_driving, best_model_driving_pop, best_model_public, best_model_public_pop, best_model_knn, best_model_knn_pop, model.names = model_names, stars = c(`***` = 0.001, `**` = 0.01, `*` = 0.05, `^` = 0.1) , number_format = "%.3f", statistics=c("logLik","N. obs." = "nobs"))
# Full table with controls.
table_ctrl <- export_summs(model_base, model_controls, model_ctrl_east_west, model_ctrl_single_city_region, model_ctrl_share_cities, model_ctrl_cohort, model_ctrl_pop, model_ctrl_urban_permeation, model_ctrl_num_ram, model_ctrl_den, model_ctrl_traffic,best_model_ctrl_dist, best_model_ctrl_dist_pop, best_model_ctrl_driving, best_model_ctrl_driving_pop, best_model_ctrl_public, best_model_ctrl_public_pop, best_model_ctrl_knn, best_model_ctrl_knn_pop,
model.names = model_names_ctrl , stars = c(`***` = 0.001, `**` = 0.01, `*` = 0.05, `^` = 0.1) , number_format = "%.3f", statistics=c("logLik","AIC","BIC","N. obs." = "nobs"))
# Model labels for the publication tables, in the order used in the paper.
output_model_names <- c("Base", "CITY", "CITY-SHARE", "EA-WE", "URB", "DEN", "COHO", "POP", "POP-DIST", "POP-DRIVE", "POP-PUBLIC")
output_model_names_ctrl <- c("Base", "Controls", "CITY", "CITY-SHARE", "EA-WE", "URB", "DEN", "COHO", "POP", "POP-DIST", "POP-DRIVE", "POP-PUBLIC")
# Reordered and sub-selected models (without controls).
output_table <- export_summs(
model_base,
model_single_city_region,
model_share_cities,
model_east_west,
model_urban_permeation,
model_den,
model_cohort,
model_pop,
best_model_dist_pop,
best_model_driving_pop,
best_model_public_pop,
model.names = output_model_names,
stars = c(`***` = 0.001, `**` = 0.01, `*` = 0.05, `^` = 0.1),
number_format = "%.3f",
statistics = c("logLik", "N. obs." = "nobs")
)
# Reordered and sub-selected models (with controls), then both tables are
# written to HTML; file names encode the selected profession and target.
output_table_ctrl <- export_summs(
model_base,
model_controls,
model_ctrl_single_city_region,
model_ctrl_share_cities,
model_ctrl_east_west,
model_ctrl_urban_permeation,
model_ctrl_den,
model_ctrl_cohort,
model_ctrl_pop,
best_model_ctrl_dist_pop,
best_model_ctrl_driving_pop,
best_model_ctrl_public_pop,
model.names = output_model_names_ctrl,
stars = c(`***` = 0.001, `**` = 0.01, `*` = 0.05, `^` = 0.1),
number_format = "%.3f",
statistics = c( "logLik", "AIC","BIC", "N. obs." = "nobs")
)
# Saving the HTML outputs.
output_table %>% quick_html(file = paste0("./Outputs/Regressions/", profession_selected, target, "Agg.html"))
output_table_ctrl %>% quick_html(file = paste0("./Outputs/Regressions/", profession_selected, target, "ControlAgg.html"))
# Compact table of the headline specifications (with and without controls).
model_names_new <- c("BASE", "COHO", "POP-DRIVE", "POP-PUBLIC", "CONTROL", "COHO", "POP-DRIVE", "POP-PUBLIC")
# Sub-selected models in the specified order.
table_new <- export_summs(
model_base,
model_cohort,
best_model_driving_pop,
best_model_public_pop,
model_controls,
model_ctrl_cohort,
best_model_ctrl_driving_pop,
best_model_ctrl_public_pop,
number_format = "%.3f",
model.names = model_names_new,
stars = c(`***` = 0.001, `**` = 0.01, `*` = 0.05, `^` = 0.1),
coefs = names(best_model_ctrl_public_pop$coefficients), # Agglomeration placed last
statistics = c("logLik", "N. obs." = "nobs") # Adjusted R-squared removed
) %>%
set_font_size(10) %>%
set_tb_padding(1) %>%
# Italicise the last row; `final(1)` here is huxtable's row selector, not
# the `final` data frame used earlier.
set_italic(final(1), 1)
# Saving the HTML output.
table_new %>% quick_html(file = paste0("./Outputs/Regressions/", profession_selected, target, "SelectedModels.html"))
This chunk uses the coefficients from the best-performing model (best_model_ctrl_public_pop) to calculate a "local matching efficiency" score for each district. This score, which represents the constant term of the matching function adjusted for local characteristics, is then visualized on a map.
# Drop coefficients by position (kept: intercept plus the local controls and
# the agglomeration term).  NOTE(review): the indices -c(2,3,9,10,11) are
# hard-coded against the coefficient order of formula_ctrl_agg — verify after
# any change to the formula.
estimate_vector<- best_model_ctrl_public_pop$coefficients[-c(2,3,9,10,11)]
# Design matrix: an intercept column plus the district attributes matching
# the retained coefficients.
estimate_values <- as.matrix(cbind("(Intercept)"=1, st_drop_geometry(profession_new[,names(estimate_vector[-1])])))
# Efficiency = exp(linear predictor), i.e. the adjusted matching constant.
profession_new$Efficiency <- exp(estimate_values %*% estimate_vector)
# Choropleth map of the efficiency score.
map_efficiency <- ggplot() +
geom_sf(data=st_as_sf(profession_new), aes(fill = Efficiency))+
theme(legend.text.align = 1) +
labs(fill = "Efficiency") +
guides(
fill = guide_colorbar(direction = "horizontal", title.position="top", barwidth = 20, # Adjust this value to change the width of the colorbar
barheight = 1 ),
# Adjust rows for the fill legend
) +
theme(
legend.position = "bottom",
legend.text = element_text(size = 18),
legend.title = element_text(size = 18, face="bold"),
legend.box = "horizontal",
legend.box.just = "center",
legend.spacing.x = unit(0.5, "cm")
)
## Warning: The `legend.text.align` argument of `theme()` is deprecated as of ggplot2
## 3.5.0.
## ā¹ Please use theme(legend.text = element_text(hjust)) instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Leftover German labels from an earlier map version, kept for reference:
#labs(shape = "Bevƶlkerung ", fill = "Defizit an Auszubildenden") +
# scale_fill_gradient2(high = "#96c11f", low = "#003268", mid = "white")
# Render the efficiency map with the project's custom palette; grey for NAs.
map_efficiency + scale_fill_gradientn(colors = custom_colors(100), na.value="#e4e4e4")
This plot shows the correlation between all the different agglomeration measures that were calculated. It helps to understand which concepts of agglomeration are similar and which are distinct. The matrix is sorted by average correlation to make it easier to interpret.
# Step 2: Echo the model labels for reference (printed below), then select
# the agglomeration columns.
model_names
## [1] "Base" "EA-WE" "CITY" "CITY-SHARE" "COHO"
## [6] "POP" "URB" "DEN" "TI" "DIST"
## [11] "POP-DIST" "DRIVE" "POP-DRIVE" "PUBLIC" "POP-PUB"
## [16] "KNN" "KNNP"
agglo_cols <- grep("Agglomeration", names(profession), value = TRUE)
# Reorder the columns so they line up with model_names[-1] below.
# NOTE(review): this hard-coded index vector depends on the column creation
# order earlier in the script (it assumes 17 matching columns and skips
# index 7) — verify after any change to the set of measures.
agglo_cols <- agglo_cols[c(1:6,8:9,11,10,13,12,15,14,17,16)]
agglo_data <- profession[, agglo_cols]
# Resulting order of labels:
#[1] "Base" "EA-WE" "CITY" "CITY-N" "COHO" "POP" "URB" "DEN" "TI"
#[10] "DIST" "POP-DIST" "DRIVE" "POP-DRIVE" "PUBLIC" "POP-PUB" "KNN" "KNNP"
model_names
## [1] "Base" "EA-WE" "CITY" "CITY-SHARE" "COHO"
## [6] "POP" "URB" "DEN" "TI" "DIST"
## [11] "POP-DIST" "DRIVE" "POP-DRIVE" "PUBLIC" "POP-PUB"
## [16] "KNN" "KNNP"
# Label the 16 agglomeration measures with the model labels (Base excluded).
colnames(agglo_data)<-model_names[-1]
# Step 3: Create a correlation matrix (complete observations only).
cor_matrix <- cor(agglo_data, use = "complete.obs")
row.names(cor_matrix) <- model_names[-1]
colnames(cor_matrix) <- model_names[-1]
# Step 4: Average absolute correlation per measure, used for sorting.
avg_cor <- rowMeans(abs(cor_matrix))
# Step 5: Sort the variables based on the average correlation.
sorted_indices <- order(avg_cor, decreasing = TRUE)
sorted_cor_matrix <- cor_matrix[sorted_indices, sorted_indices]
library(corrplot)
#custom_names <- c("CustomName1", "CustomName2", "CustomName3") # Update with your custom names
#custom_names_sorted <- custom_names[sorted_indices]
#names(sorted_cor_matrix) <- custom_names_sorted
#colnames(sorted_cor_matrix) <- custom_names_sorted
#rownames(sorted_cor_matrix) <- custom_names_sorted
# Step 6: Draw the correlation plot with the project's custom palette.
corrplot(sorted_cor_matrix, method = "circle", type = "full", tl.col = "black", tl.srt = 45, col=rev(custom_colors(100)))
This chunk creates a detailed descriptive statistics table for the main agglomeration measures, including short descriptions of what each measure represents.
library(dplyr)
library(huxtable)
# Define a function to calculate the required statistics for a column
# Summarise one numeric vector: range, mean, SD and quartiles, ignoring NAs.
# Returns a single-row data frame whose `Variable` column carries `name`.
calculate_stats <- function(x, name) {
  data.frame(
    Variable = name,
    Min      = min(x, na.rm = TRUE),
    Max      = max(x, na.rm = TRUE),
    Mean     = mean(x, na.rm = TRUE),
    SD       = sd(x, na.rm = TRUE),
    Q1       = quantile(x, 0.25, na.rm = TRUE),
    Median   = quantile(x, 0.5, na.rm = TRUE),
    Q3       = quantile(x, 0.75, na.rm = TRUE)
  )
}
# Undo the log transform so the table reports levels rather than logs.
# NOTE(review): CITY is immediately re-logged — presumably it was not
# log-transformed upstream, so exp() must be reversed for it; verify.
agglo_data <- exp(agglo_data)
agglo_data$CITY <- log(agglo_data$CITY)
# Calculate statistics for each column and bind the results into a single data frame.
agglo_stats <- bind_rows(lapply(names(agglo_data), function(name) {
calculate_stats(agglo_data[[name]], name)
}))
# Measures reported in the paper, in presentation order.
selected_values <- c("CITY", "CITY-SHARE", "URB", "DEN",
"COHO", "POP", "POP-DIST", "POP-DRIVE", "POP-PUB")
# Filter and arrange in that order.
agglo_stats <- agglo_stats %>%
filter(Variable %in% selected_values) %>%
arrange(factor(Variable, levels = selected_values))
# Human-readable names (must match the order of selected_values).
agglo_stats$Name <- c("Single City Region",
"Frequency of Cities",
"Urban Permeation",
"Population Density",
"Cohort Size",
"Population",
"Population Distance Weighted",
"Population Driving Weighted",
"Population Public Transport Weighted")
# Short descriptions shown in the output table (order matches selected_values).
agglo_stats$Description <- c(
"Indicator for being a Single City Labour Market",
"Share of population that are covered by the biggest city within a region",
"Urban Permeation (Jeager et al, 2010). It measures the intensity and spread of buildings in the landscape simultenously.",
"Population per area covered (in 1000 / km²)",
"Number of Adolescents (in 1000) between 16 and 22 years",
"Population (in 1000) of a certain region.",
" Population (in 1000) that can be reached by an average citizen within 22.3 km distance.",
" Population (in 1000) that can be reached by an average citizen within 43 minutes of car driving",
" Population (in 1000) that can be reached by an average citizen within 49.3 minutes of public transport.")
# Sort the statistics by the Mean column
#agglo_stats_sorted <- agglo_stats %>% arrange(Mean)
agglo_stats <- agglo_stats[c(1,9,10,2:8)]
# Render agglo_stats as a formatted huxtable and write it to an HTML file.
ht <- as_hux(agglo_stats) %>%
  # Solid black borders around every cell.
  set_all_borders(value = 1) %>%
  set_all_border_colors(value = "black") %>%
  set_all_border_styles(value = "solid") %>%
  # set_background_color(even = TRUE, everywhere = TRUE, value = "lightgray") %>%
  # Shade the header row and the first (Variable) column.
  set_background_color(row = 1, value = "lightgray") %>%
  set_background_color(col = 1, value = "lightgray") %>%
  set_header_rows(1, TRUE) %>%
  set_header_cols(1, TRUE) %>%
  # Keep the measure names on one line instead of wrapping.
  set_wrap(col = 1, value = FALSE)
# Write the table as a standalone HTML file.
ht %>% quick_html(file = "./Outputs/Regressions/OutputStatisticsAgg.html")
This final chunk tidies the R environment. It first archives the tuning results and fitted models into two named lists (best_value_list and best_list), then removes the individual intermediate objects, leaving only the essential final data frames and the archived results. Finally, it saves the clean workspace to a new .Rdata file, named dynamically after the profession that was analyzed.
# Archive groups of related globals into single list objects before the bulk
# clean-up, so the saved workspace stays small but nothing essential is lost.
#
# collect_globals: snapshot every object in .GlobalEnv whose name matches
# `pattern` (a regex, as used by grep) as a named list of
# list(name = <variable name>, value = <variable value>) entries.
collect_globals <- function(pattern) {
  nms <- grep(pattern, ls(envir = .GlobalEnv), value = TRUE)
  entries <- lapply(nms, function(nm) {
    list(name = nm, value = get(nm, envir = .GlobalEnv))
  })
  setNames(entries, nms)
}
# Preserve tuning results ("best_val*") and fitted models ("*model*").
best_value_list <- collect_globals("best_val")
best_list <- collect_globals("model")
# Remove the archived individual objects. NOTE: the archive itself
# ("best_value_list") also matches the "best_val" regex, so it must be
# excluded explicitly — otherwise it would be deleted immediately after
# being built.
rm(list = setdiff(ls(pattern = "best_val"), "best_value_list"))
rm(list = ls(pattern = "best_itt"))
rm(list = ls(pattern = "model"))
# Remove the remaining named intermediates; intersect with ls() so objects
# that were never created (or already removed) do not trigger warnings.
intermediates <- c("borders", "column_names", "connection_commute",
                   "connection_drive", "connection_graphics", "connection_sf",
                   "cormatrix", "d_mat_c", "d_mat_d", "d_mat1_c", "d_mat1_d",
                   "d_mat2_c", "d_mat2_d", "d_test_commute", "d_test_drive",
                   "data_impute", "df_filtered", "distances_relevant_pop",
                   "distances_relevant_self", "distances_relevant_test",
                   "driving_quantiles", "map_cities", "map_deficit",
                   "profession_test", "professions", "Professions",
                   "Tabelle_AB", "Tabelle_Kreise", "table", "table_ctrl")
rm(list = intersect(intermediates, ls(envir = .GlobalEnv)))
rm(intermediates, collect_globals)
# Save the cleaned workspace; the filename encodes the analyzed profession
# and the target variable of this run.
save.image(file = paste0("./DataBackups/", profession_selected, target, ".Rdata"))